@inproceedings{Vaswani2017AttentionIA,
  title={Attention is All you Need},
  author={Ashish Vaswani and Noam M. Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. Gomez and Lukasz Kaiser and Illia Polosukhin},
  booktitle={Neural Information Processing Systems},
  year={2017},
  url={https://api.semanticscholar.org/CorpusID:13756489}
}
@article{Bahdanau2014NeuralMT,
  title={Neural Machine Translation by Jointly Learning to Align and Translate},
  author={Dzmitry Bahdanau and Kyunghyun Cho and Yoshua Bengio},
  journal={CoRR},
  year={2014},
  volume={abs/1409.0473},
  url={https://api.semanticscholar.org/CorpusID:11212020}
}
# Preferences General ############################################################
@article{lambert2023entangled,
  title={Entangled preferences: The history and risks of reinforcement learning and human feedback},
  author={Lambert, Nathan and Gilbert, Thomas Krendl and Zick, Tom},
  journal={arXiv preprint arXiv:2310.13595},
  year={2023}
}

@article{wirth2017survey,
  title={A survey of preference-based reinforcement learning methods},
  author={Wirth, Christian and Akrour, Riad and Neumann, Gerhard and F{\"u}rnkranz, Johannes},
  journal={Journal of Machine Learning Research},
  volume={18},
  number={136},
  pages={1--46},
  year={2017}
}
################################################################################################

# pref history
# utility to go formulation 
@techreport{widrow1960adaptive,
  title={Adaptive switching circuits},
  author={Widrow, Bernard and Hoff, Marcian E},
  year={1960},
  institution={Stanford Univ Ca Stanford Electronics Labs}
}
@book{skinner2019behavior,
  title={The behavior of organisms: An experimental analysis},
  author={Skinner, Burrhus Frederic},
  year={2019},
  publisher={BF Skinner Foundation}
}
@article{thorndike1927law,
  title={The law of effect},
  author={Thorndike, Edward L},
  journal={The American journal of psychology},
  volume={39},
  number={1/4},
  pages={212--222},
  year={1927},
  publisher={JSTOR}
}
# original notion of "what one must do" 
@book{arnauld1861port,
  title={The Port-Royal Logic},
  author={Arnauld, Antoine},
  year={1662},
}

# original notion of "what one must do" 
@book{bentham1823hedonic,
  title={An Introduction to the Principles of Morals and Legislation},
  author={Bentham, Jeremy},
  year={1823},
}

@article{von1947theory,
  title={Theory of games and economic behavior, 2nd rev},
  author={Von Neumann, John and Morgenstern, Oskar},
  year={1947},
  publisher={Princeton university press}
}
@article{ramsey2016truth,
  title={Truth and probability},
  author={Ramsey, Frank P},
  journal={Readings in Formal Epistemology: Sourcebook},
  pages={21--45},
  year={2016},
  publisher={Springer}
}

@article{arrow1950difficulty,
  title={A difficulty in the concept of social welfare},
  author={Arrow, Kenneth J},
  journal={Journal of political economy},
  volume={58},
  number={4},
  pages={328--346},
  year={1950},
  publisher={The University of Chicago Press}
}
@article{harsanyi1977rule,
  title={Rule utilitarianism and decision theory},
  author={Harsanyi, John C},
  journal={Erkenntnis},
  volume={11},
  number={1},
  pages={25--53},
  year={1977},
  publisher={Springer}
}
@book{pettigrew2019choosing,
  title={Choosing for changing selves},
  author={Pettigrew, Richard},
  year={2019},
  publisher={Oxford University Press}
}
@inproceedings{soares2015corrigibility,
  title={Corrigibility},
  author={Soares, Nate and Fallenstein, Benja and Armstrong, Stuart and Yudkowsky, Eliezer},
  booktitle={Workshops at the twenty-ninth AAAI conference on artificial intelligence},
  year={2015}
}
############################################################################################

# AI General ####################################################################
@book{russell2016artificial,
  title={Artificial intelligence: a modern approach},
  author={Russell, Stuart J and Norvig, Peter},
  year={2016},
  publisher={Pearson}
}

################################################################################################

# RL related lit
@article{williams1992simple,
  title={Simple statistical gradient-following algorithms for connectionist reinforcement learning},
  author={Williams, Ronald J},
  journal={Machine learning},
  volume={8},
  pages={229--256},
  year={1992},
  publisher={Springer}
}
@article{schulman2017proximal,
  title={Proximal policy optimization algorithms},
  author={Schulman, John and Wolski, Filip and Dhariwal, Prafulla and Radford, Alec and Klimov, Oleg},
  journal={arXiv preprint arXiv:1707.06347},
  year={2017}
}
@article{berner2019dota,
  title={Dota 2 with large scale deep reinforcement learning},
  author={Berner, Christopher and Brockman, Greg and Chan, Brooke and Cheung, Vicki and D{\k{e}}biak, Przemys{\l}aw and Dennison, Christy and Farhi, David and Fischer, Quirin and Hashme, Shariq and Hesse, Chris and others},
  journal={arXiv preprint arXiv:1912.06680},
  year={2019}
}
@inproceedings{huang2024n,
  title={The N+ Implementation Details of {RLHF} with {PPO}: A Case Study on {TL};{DR} Summarization},
  author={Shengyi Huang and Michael Noukhovitch and Arian Hosseini and Kashif Rasul and Weixun Wang and Lewis Tunstall},
  booktitle={First Conference on Language Modeling},
  year={2024},
  url={https://openreview.net/forum?id=kHO2ZTa8e3}
}
@article{weng2018PG,
  title   = "Policy Gradient Algorithms",
  author  = "Weng, Lilian",
  journal = "lilianweng.github.io",
  year    = "2018",
  url     = "https://lilianweng.github.io/posts/2018-04-08-policy-gradient/"
}
@misc{achiam2018spinning,
  title={Spinning up in deep reinforcement learning},
  author={Achiam, Joshua},
  year={2018}
}
@article{kool2019buy,
  title={Buy 4 reinforce samples, get a baseline for free!},
  author={Kool, Wouter and van Hoof, Herke and Welling, Max},
  year={2019}
}
@inproceedings{knox2008tamer,
  title={Tamer: Training an agent manually via evaluative reinforcement},
  author={Knox, W Bradley and Stone, Peter},
  booktitle={2008 7th IEEE international conference on development and learning},
  pages={292--297},
  year={2008},
  organization={IEEE}
}
@inproceedings{macglashan2017interactive,
  title={Interactive learning from policy-dependent human feedback},
  author={MacGlashan, James and Ho, Mark K and Loftin, Robert and Peng, Bei and Wang, Guan and Roberts, David L and Taylor, Matthew E and Littman, Michael L},
  booktitle={International conference on machine learning},
  pages={2285--2294},
  year={2017},
  organization={PMLR}
}
@inproceedings{warnell2018deep,
  title={Deep tamer: Interactive agent shaping in high-dimensional state spaces},
  author={Warnell, Garrett and Waytowich, Nicholas and Lawhern, Vernon and Stone, Peter},
  booktitle={Proceedings of the AAAI conference on artificial intelligence},
  volume={32},
  number={1},
  year={2018}
}

@article{kaufmann2023survey,
  title={A survey of reinforcement learning from human feedback},
  author={Kaufmann, Timo and Weng, Paul and Bengs, Viktor and H{\"u}llermeier, Eyke},
  journal={arXiv preprint arXiv:2312.14925},
  year={2023}
}
@article{sutton2018reinforcement,
  title={Reinforcement learning: An introduction},
  author={Sutton, Richard S},
  journal={A Bradford Book},
  year={2018}
}
@inproceedings{ng2000algorithms,
  title={Algorithms for inverse reinforcement learning.},
  author={Ng, Andrew Y and Russell, Stuart and others},
  booktitle = {Proceedings of the Seventeenth International Conference on Machine Learning},
  pages = {663–-670},
  numpages = {8},
  series = {ICML '00},
  year={2000}
}
@inproceedings{schulman2015high,
  title={High-dimensional continuous control using generalized advantage estimation},
  author={Schulman, John and Moritz, Philipp and Levine, Sergey and Jordan, Michael and Abbeel, Pieter},
  booktitle={Proceedings of the International Conference on Learning Representations (ICLR)},
  year =2016
}
@misc{seita2017gae,
  author = {Daniel Seita},
  title = {Notes on the Generalized Advantage Estimation Paper},
  year = {2017},
  url = {https://danieltakeshi.github.io/2017/04/02/notes-on-the-generalized-advantage-estimation-paper/}
}
@article{lambert2020objective,
  title={Objective mismatch in model-based reinforcement learning},
  author={Lambert, Nathan and Amos, Brandon and Yadan, Omry and Calandra, Roberto},
  journal={arXiv preprint arXiv:2002.04523},
  year={2020}
}

@article{lambert2023alignment,
  title={The alignment ceiling: Objective mismatch in reinforcement learning from human feedback},
  author={Lambert, Nathan and Calandra, Roberto},
  journal={arXiv preprint arXiv:2311.00168},
  year={2023}
}
@inproceedings{sharma2023towards,
  title={Towards Understanding Sycophancy in Language Models},
  author={Mrinank Sharma and Meg Tong and Tomasz Korbak and David Duvenaud and Amanda Askell and Samuel R. Bowman and Esin DURMUS and Zac Hatfield-Dodds and Scott R Johnston and Shauna M Kravec and Timothy Maxwell and Sam McCandlish and Kamal Ndousse and Oliver Rausch and Nicholas Schiefer and Da Yan and Miranda Zhang and Ethan Perez},
  booktitle={The Twelfth International Conference on Learning Representations},
  year={2024},
  url={https://openreview.net/forum?id=tvhaxkMKAn}
}
# models & history ####################################################################
@article{brown2020language,
  title={Language models are few-shot learners},
  author={Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
  journal={Advances in neural information processing systems},
  volume={33},
  pages={1877--1901},
  year={2020}
}
@article{raffel2020exploring,
  title={Exploring the limits of transfer learning with a unified text-to-text transformer},
  author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
  journal={Journal of machine learning research},
  volume={21},
  number={140},
  pages={1--67},
  year={2020}
}
@inproceedings{wei2021finetuned,
  title={Finetuned Language Models are Zero-Shot Learners},
  author={Jason Wei and Maarten Bosma and Vincent Zhao and Kelvin Guu and Adams Wei Yu and Brian Lester and Nan Du and Andrew M. Dai and Quoc V Le},
  booktitle={International Conference on Learning Representations},
  year={2022},
  url={https://openreview.net/forum?id=gEZrGCozdqR}
}
@inproceedings{sanh2021multitask,
  title={Multitask Prompted Training Enables Zero-Shot Task Generalization},
  author={Victor Sanh and Albert Webson and Colin Raffel and Stephen Bach and Lintang Sutawika and Zaid Alyafeai and Antoine Chaffin and Arnaud Stiegler and Arun Raja and Manan Dey and M Saiful Bari and Canwen Xu and Urmish Thakker and Shanya Sharma Sharma and Eliza Szczechla and Taewoon Kim and Gunjan Chhablani and Nihal Nayak and Debajyoti Datta and Jonathan Chang and Mike Tian-Jian Jiang and Han Wang and Matteo Manica and Sheng Shen and Zheng Xin Yong and Harshit Pandey and Rachel Bawden and Thomas Wang and Trishala Neeraj and Jos Rozen and Abheesht Sharma and Andrea Santilli and Thibault Fevry and Jason Alan Fries and Ryan Teehan and Teven Le Scao and Stella Biderman and Leo Gao and Thomas Wolf and Alexander M Rush},
  booktitle={International Conference on Learning Representations},
  year={2022},
  url={https://openreview.net/forum?id=9Vrb9D0WI4}
}
@inproceedings{mishra2021cross,
    title = "Cross-Task Generalization via Natural Language Crowdsourcing Instructions",
    author = "Mishra, Swaroop  and
      Khashabi, Daniel  and
      Baral, Chitta  and
      Hajishirzi, Hannaneh",
    booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = may,
    year = "2022",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.acl-long.244/",
    doi = "10.18653/v1/2022.acl-long.244",
    pages = "3470--3487",
}
@misc{alpaca,
  author = {Rohan Taori and Ishaan Gulrajani and Tianyi Zhang and Yann Dubois and Xuechen Li and Carlos Guestrin and Percy Liang and Tatsunori B. Hashimoto },
  title = {Stanford Alpaca: An Instruction-following LLaMA model},
  year = {2023},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/tatsu-lab/stanford_alpaca}},
}
@misc{vicuna2023,
    title = {Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90\%* ChatGPT Quality},
    url = {https://lmsys.org/blog/2023-03-30-vicuna/},
    author = {Chiang, Wei-Lin and Li, Zhuohan and Lin, Zi and Sheng, Ying and Wu, Zhanghao and Zhang, Hao and Zheng, Lianmin and Zhuang, Siyuan and Zhuang, Yonghao and Gonzalez, Joseph E. and Stoica, Ion and Xing, Eric P.},
    month = {March},
    year = {2023}
}
@misc{koala_blogpost_2023,
  author = {Xinyang Geng and Arnav Gudibande and Hao Liu and Eric Wallace and Pieter Abbeel and Sergey Levine and Dawn Song},
  title = {Koala: A Dialogue Model for Academic Research},
  howpublished = {Blog post},
  month = {April},
  year = {2023},
  url = {https://bair.berkeley.edu/blog/2023/04/03/koala/},
  urldate = {2023-04-03}
}
@online{DatabricksBlog2023DollyV1,
    author    = {Mike Conover and Matt Hayes and Ankit Mathur and Xiangrui Meng and Jianwei Xie and Jun Wan and Ali Ghodsi and Patrick Wendell and Matei Zaharia},
    title     = {Hello Dolly: Democratizing the magic of ChatGPT with open models},
    year      = {2023},
    url       = {https://www.databricks.com/blog/2023/03/24/hello-dolly-democratizing-magic-chatgpt-open-models.html},
    urldate   = {2023-06-30}
}
@inproceedings{tunstall2023zephyr,
  title={Zephyr: Direct Distillation of {LM} Alignment},
  author={Lewis Tunstall and Edward Emanuel Beeching and Nathan Lambert and Nazneen Rajani and Kashif Rasul and Younes Belkada and Shengyi Huang and Leandro Von Werra and Cl{\'e}mentine Fourrier and Nathan Habib and Nathan Sarrazin and Omar Sanseviero and Alexander M Rush and Thomas Wolf},
  booktitle={First Conference on Language Modeling},
  year={2024},
  url={https://openreview.net/forum?id=aKkAwZB6JV}
}
@article{zhou2023lima,
  title={Lima: Less is more for alignment},
  author={Zhou, Chunting and Liu, Pengfei and Xu, Puxin and Iyer, Srinivasan and Sun, Jiao and Mao, Yuning and Ma, Xuezhe and Efrat, Avia and Yu, Ping and Yu, Lili and others},
  journal={Advances in Neural Information Processing Systems},
  volume={36},
  pages={55006--55021},
  year={2023}
}
@article{wang2022self,
  title={Self-instruct: Aligning language models with self-generated instructions},
  author={Wang, Yizhong and Kordi, Yeganeh and Mishra, Swaroop and Liu, Alisa and Smith, Noah A and Khashabi, Daniel and Hajishirzi, Hannaneh},
  journal={arXiv preprint arXiv:2212.10560},
  year={2022}
}
@article{dettmers2023qlora,
  title={Qlora: Efficient finetuning of quantized llms},
  author={Dettmers, Tim and Pagnoni, Artidoro and Holtzman, Ari and Zettlemoyer, Luke},
  journal={Advances in neural information processing systems},
  volume={36},
  pages={10088--10115},
  year={2023}
}
@article{ivison2023camels,
  title={Camels in a changing climate: Enhancing lm adaptation with tulu 2},
  author={Ivison, Hamish and Wang, Yizhong and Pyatkin, Valentina and Lambert, Nathan and Peters, Matthew and Dasigi, Pradeep and Jang, Joel and Wadden, David and Smith, Noah A and Beltagy, Iz and others},
  journal={arXiv preprint arXiv:2311.10702},
  year={2023}
}
@article{ivison2024unpacking,
  title={Unpacking DPO and PPO: Disentangling Best Practices for Learning from Preference Feedback},
  author={Ivison, Hamish and Wang, Yizhong and Liu, Jiacheng and Wu, Zeqiu and Pyatkin, Valentina and Lambert, Nathan and Smith, Noah A and Choi, Yejin and Hajishirzi, Hannaneh},
  journal={arXiv preprint arXiv:2406.09279},
  year={2024}
}
@article{xu2024dpo,
  title={Is dpo superior to ppo for llm alignment? a comprehensive study},
  author={Xu, Shusheng and Fu, Wei and Gao, Jiaxuan and Ye, Wenjie and Liu, Weilin and Mei, Zhiyu and Wang, Guangju and Yu, Chao and Wu, Yi},
  journal={arXiv preprint arXiv:2404.10719},
  year={2024}
}
@article{cui2023ultrafeedback,
  title={Ultrafeedback: Boosting language models with high-quality feedback},
  author={Cui, Ganqu and Yuan, Lifan and Ding, Ning and Yao, Guanming and Zhu, Wei and Ni, Yuan and Xie, Guotong and Liu, Zhiyuan and Sun, Maosong},
  year={2023}
}
@article{cohere2025command,
  title={Command A: An Enterprise-Ready Large Language Model},
  author={Cohere, Team and Ahmadian, Arash and Ahmed, Marwan and Alammar, Jay and Alnumay, Yazeed and Althammer, Sophia and Arkhangorodsky, Arkady and Aryabumi, Viraat and Aumiller, Dennis and Avalos, Rapha{\"e}l and others},
  journal={arXiv preprint arXiv:2504.00698},
  year={2025}
}
@misc{numina_math_7b,
  author = {Edward Beeching and Shengyi Costa Huang and Albert Jiang and Jia Li and Benjamin Lipkin and Zihan Qina and Kashif Rasul and Ziju Shen and Roman Soletskyi and Lewis Tunstall},
  title = {NuminaMath 7B TIR},
  year = {2024},
  publisher = {Numina & Hugging Face},
  journal = {Hugging Face repository},
  howpublished = {\url{https://huggingface.co/AI-MO/NuminaMath-7B-TIR}}
}
@misc{yu2025dapo,
  author = {Yu, Qiying and Zhang, Zheng and Zhu, Ruofei and Yuan, Yufeng and Yue, Yu and Fan, Tiantian and Liu, Gaohong and Liu, Lingjun and Liu, Xin and Lin, Haibin and Lin, Zhiqi and Ma, Bole and Sheng, Guangming and Tong, Yuxuan and Zhang, Chi and Zhang, Mofan and Zhang, Wang and Zhu, Hang and Zhu, Jinhua and Chen, Jiaze and Chen, Jiangjie and Wang, Chengyi and Yu, Hongli and Dai, Weinan and Song, Yuxuan and Wei, Xiangpeng and Zhou, Hao and Liu, Jingjing and Ma, Wei-Ying and Zhang, Ya-Qin and Qiao, Mu and Yan, Lin and Wu, Yonghui and Wang, Mingxuan},
  title = {DAPO: an Open-Source LLM Reinforcement Learning System at Scale},
  year = {2025},
  month = {March},
  note = {Available at \url{https://dapo-sia.github.io/static/pdf/dapo_paper.pdf}}
}
@article{baheti2023leftover,
  title={Leftover lunch: advantage-based offline reinforcement learning for language models},
  author={Baheti, Ashutosh and Lu, Ximing and Brahman, Faeze and Bras, Ronan Le and Sap, Maarten and Riedl, Mark},
  journal={arXiv preprint arXiv:2305.14718},
  year={2023}
}
@article{yuan2025vapo,
  title={VAPO: Efficient and Reliable Reinforcement Learning for Advanced Reasoning Tasks},
  author={Yuan, Yufeng and Yu, Qiying and Zuo, Xiaochen and Zhu, Ruofei and Xu, Wenyuan and Chen, Jiaze and Wang, Chengyi and Fan, TianTian and Du, Zhengyin and Wei, Xiangpeng and others},
  journal={arXiv preprint arXiv:2504.05118},
  year={2025}
}
@article{yuan2025s,
  title={What's Behind PPO's Collapse in Long-CoT? Value Optimization Holds the Secret},
  author={Yuan, Yufeng and Yue, Yu and Zhu, Ruofei and Fan, Tiantian and Yan, Lin},
  journal={arXiv preprint arXiv:2503.01491},
  year={2025}
}
@article{liu2025understanding,
  title={Understanding R1-Zero-Like Training: A Critical Perspective},
  author={Liu, Zichen and Chen, Changyu and Li, Wenjun and Qi, Penghui and Pang, Tianyu and Du, Chao and Lee, Wee Sun and Lin, Min},
  journal={arXiv preprint arXiv:2503.20783},
  year={2025},
  month={Mar},
  url={https://arxiv.org/abs/2503.20783}
}
@article{flet2024contrastive,
  title={Contrastive Policy Gradient: Aligning LLMs on sequence-level scores in a supervised-friendly fashion},
  author={Flet-Berliac, Yannis and Grinsztajn, Nathan and Strub, Florian and Wu, Bill and Choi, Eugene and Cremer, Chris and Ahmadian, Arash and Chandak, Yash and Azar, Mohammad Gheshlaghi and Pietquin, Olivier and others},
  journal={arXiv preprint arXiv:2406.19185},
  year={2024}
}
# RLHF Methods ####################################################################
@article{BradleyTerry,
 ISSN = {00063444},
 URL = {http://www.jstor.org/stable/2334029},
 author = {Ralph Allan Bradley and Milton E. Terry},
 journal = {Biometrika},
 number = {3/4},
 pages = {324--345},
 publisher = {[Oxford University Press, Biometrika Trust]},
 title = {Rank Analysis of Incomplete Block Designs: I. The Method of Paired Comparisons},
 urldate = {2023-02-13},
 volume = {39},
 year = {1952}
}

@article{likert1932technique,
  title={A technique for the measurement of attitudes.},
  author={Likert, Rensis},
  journal={Archives of psychology},
  year={1932}
}

@article{gilks1992adaptive,
  title={Adaptive rejection sampling for Gibbs sampling},
  author={Gilks, Walter R and Wild, Pascal},
  journal={Journal of the Royal Statistical Society: Series C (Applied Statistics)},
  volume={41},
  number={2},
  pages={337--348},
  year={1992},
  publisher={Wiley Online Library}
}

# ReMax
@inproceedings{li2023remax,
  title={Remax: A simple, effective, and efficient reinforcement learning method for aligning large language models},
  author={Li, Ziniu and Xu, Tian and Zhang, Yushun and Lin, Zhihang and Yu, Yang and Sun, Ruoyu and Luo, Zhi-Quan},
  booktitle={Forty-first International Conference on Machine Learning},
  year={2023}
}
@article{team2025kimi,
  title={Kimi k1. 5: Scaling reinforcement learning with llms},
  author={Team, Kimi and Du, Angang and Gao, Bofei and Xing, Bowei and Jiang, Changjiu and Chen, Cheng and Li, Cheng and Xiao, Chenjun and Du, Chenzhuang and Liao, Chonghua and others},
  journal={arXiv preprint arXiv:2501.12599},
  year={2025}
}
@article{wu2023pairwise,
  title={Pairwise proximal policy optimization: Harnessing relative feedback for llm alignment},
  author={Wu, Tianhao and Zhu, Banghua and Zhang, Ruoyu and Wen, Zhaojin and Ramchandran, Kannan and Jiao, Jiantao},
  journal={arXiv preprint arXiv:2310.00212},
  year={2023}
}
@article{gunter2024apple,
  title={Apple intelligence foundation language models},
  author={Gunter, Tom and Wang, Zirui and Wang, Chong and Pang, Ruoming and Narayanan, Andy and Zhang, Aonan and Zhang, Bowen and Chen, Chen and Chiu, Chung-Cheng and Qiu, David and others},
  journal={arXiv preprint arXiv:2407.21075},
  year={2024}
}
@article{zhang2025improving,
  title={Improving LLM General Preference Alignment via Optimistic Online Mirror Descent},
  author={Zhang, Yuheng and Yu, Dian and Ge, Tao and Song, Linfeng and Zeng, Zhichen and Mi, Haitao and Jiang, Nan and Yu, Dong},
  journal={arXiv preprint arXiv:2502.16852},
  year={2025}
}
@article{tomar2020mirror,
  title={Mirror descent policy optimization},
  author={Tomar, Manan and Shani, Lior and Efroni, Yonathan and Ghavamzadeh, Mohammad},
  journal={arXiv preprint arXiv:2005.09814},
  year={2020}
}
################################################################################################

# Reward Modeling More ####################################################################
@article{lambert2024rewardbench,
  title={Rewardbench: Evaluating reward models for language modeling},
  author={Lambert, Nathan and Pyatkin, Valentina and Morrison, Jacob and Miranda, LJ and Lin, Bill Yuchen and Chandu, Khyathi and Dziri, Nouha and Kumar, Sachin and Zick, Tom and Choi, Yejin and others},
  journal={arXiv preprint arXiv:2403.13787},
  year={2024}
}
@article{zhou2024rmb,
  title={RMB: Comprehensively Benchmarking Reward Models in LLM Alignment},
  author={Zhou, Enyu and Zheng, Guodong and Wang, Binghai and Xi, Zhiheng and Dou, Shihan and Bao, Rong and Shen, Wei and Xiong, Limao and Fan, Jessica and Mou, Yurong and others},
  journal={arXiv preprint arXiv:2410.09893},
  year={2024}
}
@article{liu2024rm,
  title={RM-bench: Benchmarking reward models of language models with subtlety and style},
  author={Liu, Yantao and Yao, Zijun and Min, Rui and Cao, Yixin and Hou, Lei and Li, Juanzi},
  journal={arXiv preprint arXiv:2410.16184},
  year={2024}
}
@article{liu2024rm,
  title={RM-bench: Benchmarking reward models of language models with subtlety and style},
  author={Liu, Yantao and Yao, Zijun and Min, Rui and Cao, Yixin and Hou, Lei and Li, Juanzi},
  journal={arXiv preprint arXiv:2410.16184},
  year={2024}
}
@article{gureja2024m,
  title={M-RewardBench: Evaluating Reward Models in Multilingual Settings},
  author={Gureja, Srishti and Miranda, Lester James V and Islam, Shayekh Bin and Maheshwary, Rishabh and Sharma, Drishti and Winata, Gusti and Lambert, Nathan and Ruder, Sebastian and Hooker, Sara and Fadaee, Marzieh},
  journal={arXiv preprint arXiv:2410.15522},
  year={2024}
}
@article{chen2024mj,
  title={MJ-Bench: Is Your Multimodal Reward Model Really a Good Judge for Text-to-Image Generation?},
  author={Chen, Zhaorun and Du, Yichao and Wen, Zichen and Zhou, Yiyang and Cui, Chenhang and Weng, Zhenzhen and Tu, Haoqin and Wang, Chaoqi and Tong, Zhengwei and Huang, Qinglan and others},
  journal={arXiv preprint arXiv:2407.04842},
  year={2024}
}
@article{wu2025rewordbench,
  title={reWordBench: Benchmarking and Improving the Robustness of Reward Models with Transformed Inputs},
  author={Wu, Zhaofeng and Yasunaga, Michihiro and Cohen, Andrew and Kim, Yoon and Celikyilmaz, Asli and Ghazvininejad, Marjan},
  journal={arXiv preprint arXiv:2503.11751},
  year={2025}
}
@article{yasunaga2025multimodal,
  title={Multimodal rewardbench: Holistic evaluation of reward models for vision language models},
  author={Yasunaga, Michihiro and Zettlemoyer, Luke and Ghazvininejad, Marjan},
  journal={arXiv preprint arXiv:2502.14191},
  year={2025}
}
@article{li2024vlrewardbench,
  title={VLRewardBench: A Challenging Benchmark for Vision-Language Generative Reward Models},
  author={Li, Lei and Wei, Yuancheng and Xie, Zhihui and Yang, Xuqing and Song, Yifan and Wang, Peiyi and An, Chenxin and Liu, Tianyu and Li, Sujian and Lin, Bill Yuchen and others},
  journal={arXiv preprint arXiv:2411.17451},
  year={2024}
}
@article{ruan2025vlrmbench,
  title={Vlrmbench: A comprehensive and challenging benchmark for vision-language reward models},
  author={Ruan, Jiacheng and Yuan, Wenzhen and Gao, Xian and Guo, Ye and Zhang, Daoxin and Xu, Zhe and Hu, Yao and Liu, Ting and Fu, Yuzhuo},
  journal={arXiv preprint arXiv:2503.07478},
  year={2025}
}
@article{song2025prmbench,
  title={PRMBench: A Fine-grained and Challenging Benchmark for Process-Level Reward Models},
  author={Song, Mingyang and Su, Zhaochen and Qu, Xiaoye and Zhou, Jiawei and Cheng, Yu},
  journal={arXiv preprint arXiv:2501.03124},
  year={2025}
}
@article{wang2025visualprm,
  title={VisualPRM: An Effective Process Reward Model for Multimodal Reasoning},
  author={Wang, Weiyun and Gao, Zhangwei and Chen, Lianjie and Chen, Zhe and Zhu, Jinguo and Zhao, Xiangyu and Liu, Yangzhou and Cao, Yue and Ye, Shenglong and Zhu, Xizhou and others},
  journal={arXiv preprint arXiv:2503.10291},
  year={2025}
}
@misc{tu2025vilbench,
  title = {ViLBench: A Suite for Vision-Language Process Reward Modeling},
  author = {Tu, Haoqin and Feng, Weitao and Chen, Hardy and Liu, Hui and Tang, Xianfeng and Xie, Cihang},
  year = {2025},
  month = {Mar},
  eprint = {2503.20271},
  archivePrefix = {arXiv},
  primaryClass = {cs.CV},
  url = {https://arxiv.org/abs/2503.20271}
}
@article{wen2024rethinking,
  title={Rethinking Reward Model Evaluation: Are We Barking up the Wrong Tree?},
  author={Wen, Xueru and Lou, Jie and Lu, Yaojie and Lin, Hongyu and Yu, Xing and Lu, Xinyu and He, Ben and Han, Xianpei and Zhang, Debing and Sun, Le},
  journal={arXiv preprint arXiv:2410.05584},
  year={2024}
}
@article{jin2024rag,
  title={RAG-RewardBench: Benchmarking Reward Models in Retrieval Augmented Generation for Preference Alignment},
  author={Jin, Zhuoran and Yuan, Hongbang and Men, Tianyi and Cao, Pengfei and Chen, Yubo and Liu, Kang and Zhao, Jun},
  journal={arXiv preprint arXiv:2412.13746},
  year={2024}
}
@article{frick2024evaluate,
  title={How to Evaluate Reward Models for RLHF},
  author={Frick, Evan and Li, Tianle and Chen, Connor and Chiang, Wei-Lin and Angelopoulos, Anastasios N and Jiao, Jiantao and Zhu, Banghua and Gonzalez, Joseph E and Stoica, Ion},
  journal={arXiv preprint arXiv:2410.14872},
  year={2024}
}
@article{kim2024evaluating,
  title={Evaluating robustness of reward models for mathematical reasoning},
  author={Kim, Sunghwan and Kang, Dongjin and Kwon, Taeyoon and Chae, Hyungjoo and Won, Jungsoo and Lee, Dongha and Yeo, Jinyoung},
  journal={arXiv preprint arXiv:2410.01729},
  year={2024}
}
@inproceedings{zhu2023principled,
  title={Principled reinforcement learning with human feedback from pairwise or k-wise comparisons},
  author={Zhu, Banghua and Jordan, Michael and Jiao, Jiantao},
  booktitle={International Conference on Machine Learning},
  pages={43037--43067},
  year={2023},
  organization={PMLR}
}
@article{wang2024interpretable,
  title={Interpretable Preferences via Multi-Objective Reward Modeling and Mixture-of-Experts},
  author={Wang, Haoxiang and Xiong, Wei and Xie, Tengyang and Zhao, Han and Zhang, Tong},
  journal={arXiv preprint arXiv:2406.12845},
  year={2024}
}
@article{zhang2024generative,
  title={Generative verifiers: Reward modeling as next-token prediction},
  author={Zhang, Lunjun and Hosseini, Arian and Bansal, Hritik and Kazemi, Mehran and Kumar, Aviral and Agarwal, Rishabh},
  journal={arXiv preprint arXiv:2408.15240},
  year={2024}
}
@article{mahan2024generative,
  title={Generative Reward Models},
  author={Mahan, Dakota and Phung, Duy Van and Rafailov, Rafael and Blagden, Chase and Lile, Nathan and Castricato, Louis and Franken, Jan-Philipp and Finn, Chelsea and Albalak, Alon},
  year={2024},
  url={https://www.synthlabs.ai/pdf/Generative_Reward_Models.pdf}
}
@article{wang2024helpsteer2,
  title={HelpSteer2: Open-source dataset for training top-performing reward models},
  author={Wang, Zhilin and Dong, Yi and Delalleau, Olivier and Zeng, Jiaqi and Shen, Gerald and Egert, Daniel and Zhang, Jimmy J and Sreedhar, Makesh Narsimhan and Kuchaiev, Oleksii},
  journal={arXiv preprint arXiv:2406.08673},
  year={2024}
}
@misc{no_robots,
  author = {Nazneen Rajani and Lewis Tunstall and Edward Beeching and Nathan Lambert and Alexander M. Rush and Thomas Wolf},
  title = {No Robots},
  year = {2023},
  publisher = {Hugging Face},
  journal = {Hugging Face repository},
  howpublished = {\url{https://huggingface.co/datasets/HuggingFaceH4/no_robots}}
}
@article{wang2024helpsteer2p,
  title={HelpSteer2-Preference: Complementing Ratings with Preferences},
  author={Wang, Zhilin and Bukharin, Alexander and Delalleau, Olivier and Egert, Daniel and Shen, Gerald and Zeng, Jiaqi and Kuchaiev, Oleksii and Dong, Yi},
  journal={arXiv preprint arXiv:2410.01257},
  year={2024}
}
@article{adler2024nemotron,
  title={Nemotron-4 340B Technical Report},
  author={Adler, Bo and Agarwal, Niket and Aithal, Ashwath and Anh, Dong H and Bhattacharya, Pallab and Brundyn, Annika and Casper, Jared and Catanzaro, Bryan and Clay, Sharon and Cohen, Jonathan and others},
  journal={arXiv preprint arXiv:2406.11704},
  year={2024}
}
@article{ankner2024critique,
  title={Critique-out-loud reward models},
  author={Ankner, Zachary and Paul, Mansheej and Cui, Brandon and Chang, Jonathan D and Ammanabrolu, Prithviraj},
  journal={arXiv preprint arXiv:2408.11791},
  year={2024}
}
@article{park2024offsetbias,
  title={Offsetbias: Leveraging debiased data for tuning evaluators},
  author={Park, Junsoo and Jwa, Seungyeon and Ren, Meiying and Kim, Daeyoung and Choi, Sanghyuk},
  journal={arXiv preprint arXiv:2407.06551},
  year={2024}
}
@article{liu2025inference,
  title={Inference-Time Scaling for Generalist Reward Modeling},
  author={Liu, Zijun and Wang, Peiyi and Xu, Runxin and Ma, Shirong and Ruan, Chong and Li, Peng and Liu, Yang and Wu, Yu},
  journal={arXiv preprint arXiv:2504.02495},
  year={2025}
}
################################################################################################

# KL Refs ####################################################################
@article{jaques2020human,
  title={Human-centric dialog training via offline reinforcement learning},
  author={Jaques, Natasha and Shen, Judy Hanwen and Ghandeharioun, Asma and Ferguson, Craig and Lapedriza, Agata and Jones, Noah and Gu, Shixiang Shane and Picard, Rosalind},
  journal={arXiv preprint arXiv:2010.05848},
  year={2020}
}
@inproceedings{jaques2017sequence,
  title={Sequence tutor: Conservative fine-tuning of sequence generation models with kl-control},
  author={Jaques, Natasha and Gu, Shixiang and Bahdanau, Dzmitry and Hern{\'a}ndez-Lobato, Jos{\'e} Miguel and Turner, Richard E and Eck, Douglas},
  booktitle={International Conference on Machine Learning},
  pages={1645--1654},
  year={2017},
  organization={PMLR}
}
@inproceedings{havrilla-etal-2023-trlx,
    title = "trl{X}: A Framework for Large Scale Reinforcement Learning from Human Feedback",
    author = "Havrilla, Alexander  and
      Zhuravinskyi, Maksym  and
      Phung, Duy  and
      Tiwari, Aman  and
      Tow, Jonathan  and
      Biderman, Stella  and
      Anthony, Quentin  and
      Castricato, Louis",
    booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
    month = dec,
    year = "2023",
    address = "Singapore",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.emnlp-main.530",
    doi = "10.18653/v1/2023.emnlp-main.530",
    pages = "8578--8595",
}
@misc{vonwerra2022trl,
  author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
  title = {TRL: Transformer Reinforcement Learning},
  year = {2020},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/huggingface/trl}}
}
@misc{sharma2024critical,
      title={A Critical Evaluation of AI Feedback for Aligning Large Language Models}, 
      author={Archit Sharma and Sedrick Keh and Eric Mitchell and Chelsea Finn and Kushal Arora and Thomas Kollar},
      year={2024},
      eprint={2402.12366},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@misc{castricato2024suppressing,
      title={Suppressing Pink Elephants with Direct Principle Feedback}, 
      author={Louis Castricato and Nathan Lile and Suraj Anand and Hailey Schoelkopf and Siddharth Verma and Stella Biderman},
      year={2024},
      eprint={2402.07896},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

# RLHF Core ####################################################################
@article{christiano2017deep,
  title={Deep reinforcement learning from human preferences},
  author={Christiano, Paul F and Leike, Jan and Brown, Tom and Martic, Miljan and Legg, Shane and Amodei, Dario},
  journal={Advances in neural information processing systems},
  volume={30},
  year={2017}
}
@article{ibarz2018reward,
  title={Reward learning from human preferences and demonstrations in atari},
  author={Ibarz, Borja and Leike, Jan and Pohlen, Tobias and Irving, Geoffrey and Legg, Shane and Amodei, Dario},
  journal={Advances in neural information processing systems},
  volume={31},
  year={2018}
}
@article{leike2018scalable,
  title={Scalable agent alignment via reward modeling: a research direction},
  author={Leike, Jan and Krueger, David and Everitt, Tom and Martic, Miljan and Maini, Vishal and Legg, Shane},
  journal={arXiv preprint arXiv:1811.07871},
  year={2018}
}
@article{ziegler2019fine,
  title={Fine-tuning language models from human preferences},
  author={Ziegler, Daniel M and Stiennon, Nisan and Wu, Jeffrey and Brown, Tom B and Radford, Alec and Amodei, Dario and Christiano, Paul and Irving, Geoffrey},
  journal={arXiv preprint arXiv:1909.08593},
  year={2019}
}
@article{stiennon2020learning,
  title={Learning to summarize with human feedback},
  author={Stiennon, Nisan and Ouyang, Long and Wu, Jeffrey and Ziegler, Daniel and Lowe, Ryan and Voss, Chelsea and Radford, Alec and Amodei, Dario and Christiano, Paul F},
  journal={Advances in Neural Information Processing Systems},
  volume={33},
  pages={3008--3021},
  year={2020}
}
@article{wu2021recursively,
  title={Recursively summarizing books with human feedback},
  author={Wu, Jeff and Ouyang, Long and Ziegler, Daniel M and Stiennon, Nisan and Lowe, Ryan and Leike, Jan and Christiano, Paul},
  journal={arXiv preprint arXiv:2109.10862},
  year={2021}
}


@article{nakano2021webgpt,
  title={Webgpt: Browser-assisted question-answering with human feedback},
  author={Nakano, Reiichiro and Hilton, Jacob and Balaji, Suchir and Wu, Jeff and Ouyang, Long and Kim, Christina and Hesse, Christopher and Jain, Shantanu and Kosaraju, Vineet and Saunders, William and others},
  journal={arXiv preprint arXiv:2112.09332},
  year={2021}
}
@article{ouyang2022training,
  title={Training language models to follow instructions with human feedback},
  author={Ouyang, Long and Wu, Jeffrey and Jiang, Xu and Almeida, Diogo and Wainwright, Carroll and Mishkin, Pamela and Zhang, Chong and Agarwal, Sandhini and Slama, Katarina and Ray, Alex and others},
  journal={Advances in neural information processing systems},
  volume={35},
  pages={27730--27744},
  year={2022}
}
@article{askell2021general,
  title={A general language assistant as a laboratory for alignment},
  author={Askell, Amanda and Bai, Yuntao and Chen, Anna and Drain, Dawn and Ganguli, Deep and Henighan, Tom and Jones, Andy and Joseph, Nicholas and Mann, Ben and DasSarma, Nova and others},
  journal={arXiv preprint arXiv:2112.00861},
  year={2021}
}
@article{bai2022training,
  title={Training a helpful and harmless assistant with reinforcement learning from human feedback},
  author={Bai, Yuntao and Jones, Andy and Ndousse, Kamal and Askell, Amanda and Chen, Anna and DasSarma, Nova and Drain, Dawn and Fort, Stanislav and Ganguli, Deep and Henighan, Tom and others},
  journal={arXiv preprint arXiv:2204.05862},
  year={2022}
}
@article{ganguli2022red,
  title={Red teaming language models to reduce harms: Methods, scaling behaviors, and lessons learned},
  author={Ganguli, Deep and Lovitt, Liane and Kernion, Jackson and Askell, Amanda and Bai, Yuntao and Kadavath, Saurav and Mann, Ben and Perez, Ethan and Schiefer, Nicholas and Ndousse, Kamal and others},
  journal={arXiv preprint arXiv:2209.07858},
  year={2022}
}
@article{glaese2022improving,
  title={Improving alignment of dialogue agents via targeted human judgements},
  author={Glaese, Amelia and McAleese, Nat and Tr{\k{e}}bacz, Maja and Aslanides, John and Firoiu, Vlad and Ewalds, Timo and Rauh, Maribeth and Weidinger, Laura and Chadwick, Martin and Thacker, Phoebe and others},
  journal={arXiv preprint arXiv:2209.14375},
  year={2022}
}
@article{menick2022teaching,
  title={Teaching language models to support answers with verified quotes},
  author={Menick, Jacob and Trebacz, Maja and Mikulik, Vladimir and Aslanides, John and Song, Francis and Chadwick, Martin and Glaese, Mia and Young, Susannah and Campbell-Gillingham, Lucy and Irving, Geoffrey and others},
  journal={arXiv preprint arXiv:2203.11147},
  year={2022}
}
@article{lambert2022illustrating,
  author = {Lambert, Nathan and Castricato, Louis and von Werra, Leandro and Havrilla, Alex},
  title = {Illustrating Reinforcement Learning from Human Feedback (RLHF)},
  journal = {Hugging Face Blog},
  year = {2022},
  note = {https://huggingface.co/blog/rlhf},
}

@article{bai2022constitutional,
  title={Constitutional ai: Harmlessness from ai feedback},
  author={Bai, Yuntao and Kadavath, Saurav and Kundu, Sandipan and Askell, Amanda and Kernion, Jackson and Jones, Andy and Chen, Anna and Goldie, Anna and Mirhoseini, Azalia and McKinnon, Cameron and others},
  journal={arXiv preprint arXiv:2212.08073},
  year={2022}
}

@article{lightman2023let,
  title={Let's verify step by step},
  author={Lightman, Hunter and Kosaraju, Vineet and Burda, Yura and Edwards, Harri and Baker, Bowen and Lee, Teddy and Leike, Jan and Schulman, John and Sutskever, Ilya and Cobbe, Karl},
  journal={arXiv preprint arXiv:2305.20050},
  year={2023}
}

@article{touvron2023llama,
  title={Llama 2: Open foundation and fine-tuned chat models},
  author={Touvron, Hugo and Martin, Louis and Stone, Kevin and Albert, Peter and Almahairi, Amjad and Babaei, Yasmine and Bashlykov, Nikolay and Batra, Soumya and Bhargava, Prajjwal and Bhosale, Shruti and others},
  journal={arXiv preprint arXiv:2307.09288},
  year={2023}
}
@inproceedings{gao2023scaling,
  title={Scaling laws for reward model overoptimization},
  author={Gao, Leo and Schulman, John and Hilton, Jacob},
  booktitle={International Conference on Machine Learning},
  pages={10835--10866},
  year={2023},
  organization={PMLR}
}
@misc{yuan2025selfrewardinglanguagemodels,
      title={Self-Rewarding Language Models}, 
      author={Weizhe Yuan and Richard Yuanzhe Pang and Kyunghyun Cho and Xian Li and Sainbayar Sukhbaatar and Jing Xu and Jason Weston},
      year={2025},
      eprint={2401.10020},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2401.10020}, 
}
@article{shao2024deepseekmath,
  title={Deepseekmath: Pushing the limits of mathematical reasoning in open language models},
  author={Shao, Zhihong and Wang, Peiyi and Zhu, Qihao and Xu, Runxin and Song, Junxiao and Bi, Xiao and Zhang, Haowei and Zhang, Mingchuan and Li, YK and Wu, Y and others},
  journal={arXiv preprint arXiv:2402.03300},
  year={2024}
}
@article{adler2024nemotron,
  title={Nemotron-4 340B Technical Report},
  author={Adler, Bo and Agarwal, Niket and Aithal, Ashwath and Anh, Dong H and Bhattacharya, Pallab and Brundyn, Annika and Casper, Jared and Catanzaro, Bryan and Clay, Sharon and Cohen, Jonathan and others},
  journal={arXiv preprint arXiv:2406.11704},
  year={2024}
}
@article{dubey2024llama,
  title={The llama 3 herd of models},
  author={Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and Kadian, Abhishek and Al-Dahle, Ahmad and Letman, Aiesha and Mathur, Akhil and Schelten, Alan and Yang, Amy and Fan, Angela and others},
  journal={arXiv preprint arXiv:2407.21783},
  year={2024}
}
@article{rafailov2024direct,
  title={Direct preference optimization: Your language model is secretly a reward model},
  author={Rafailov, Rafael and Sharma, Archit and Mitchell, Eric and Manning, Christopher D and Ermon, Stefano and Finn, Chelsea},
  journal={Advances in Neural Information Processing Systems},
  volume={36},
  year={2024}
}

@article{lambert2024t,
  title={T$\backslash$" ULU 3: Pushing Frontiers in Open Language Model Post-Training},
  author={Lambert, Nathan and Morrison, Jacob and Pyatkin, Valentina and Huang, Shengyi and Ivison, Hamish and Brahman, Faeze and Miranda, Lester James V and Liu, Alisa and Dziri, Nouha and Lyu, Shane and others},
  journal={arXiv preprint arXiv:2411.15124},
  year={2024}
}
@article{liu2024deepseek,
  title={Deepseek-v3 technical report},
  author={Liu, Aixin and Feng, Bei and Xue, Bing and Wang, Bingxuan and Wu, Bochao and Lu, Chengda and Zhao, Chenggang and Deng, Chengqi and Zhang, Chenyu and Ruan, Chong and others},
  journal={arXiv preprint arXiv:2412.19437},
  year={2024}
}
@article{guo2025deepseek,
  title={Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning},
  author={Guo, Daya and Yang, Dejian and Zhang, Haowei and Song, Junxiao and Zhang, Ruoyu and Xu, Runxin and Zhu, Qihao and Ma, Shirong and Wang, Peiyi and Bi, Xiao and others},
  journal={arXiv preprint arXiv:2501.12948},
  year={2025}
}
# Inference time scaling ##############################################################
@misc{irpan2018deep,
  title={Deep Reinforcement Learning Doesn't Work Yet},
  author={Alex Irpan},
  year={2018},
  url={https://www.alexirpan.com/2018/02/14/rl-hard.html}
}
@inproceedings{henderson2018deep,
  title={Deep Reinforcement Learning that Matters},
  author={Henderson, Peter and Islam, Riashat and Bachman, Philip and Pineau, Joelle and Precup, Doina and Meger, David},
  booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
  volume={32},
  number={1},
  year={2018},
  url={https://ojs.aaai.org/index.php/AAAI/article/view/11694}
}
@article{liu2023don,
  title={Don't throw away your value model! Generating more preferable text with Value-Guided Monte-Carlo Tree Search decoding},
  author={Liu, Jiacheng and Cohen, Andrew and Pasunuru, Ramakanth and Choi, Yejin and Hajishirzi, Hannaneh and Celikyilmaz, Asli},
  journal={arXiv preprint arXiv:2309.15028},
  year={2023}
}
@article{chen2024more,
  title={Are more llm calls all you need? towards scaling laws of compound inference systems},
  author={Chen, Lingjiao and Davis, Jared Quincy and Hanin, Boris and Bailis, Peter and Stoica, Ion and Zaharia, Matei and Zou, James},
  journal={arXiv preprint arXiv:2403.02419},
  year={2024}
}
@article{brown2024large,
  title={Large language monkeys: Scaling inference compute with repeated sampling},
  author={Brown, Bradley and Juravsky, Jordan and Ehrlich, Ryan and Clark, Ronald and Le, Quoc V and R{\'e}, Christopher and Mirhoseini, Azalia},
  journal={arXiv preprint arXiv:2407.21787},
  year={2024}
}
@article{snell2024scaling,
  title={Scaling llm test-time compute optimally can be more effective than scaling model parameters},
  author={Snell, Charlie and Lee, Jaehoon and Xu, Kelvin and Kumar, Aviral},
  journal={arXiv preprint arXiv:2408.03314},
  year={2024}
}
@article{muennighoff2025s1,
  title={s1: Simple test-time scaling},
  author={Muennighoff, Niklas and Yang, Zitong and Shi, Weijia and Li, Xiang Lisa and Fei-Fei, Li and Hajishirzi, Hannaneh and Zettlemoyer, Luke and Liang, Percy and Cand{\`e}s, Emmanuel and Hashimoto, Tatsunori},
  journal={arXiv preprint arXiv:2501.19393},
  year={2025}
}
@article{sheng2024hybridflow,
  title   = {HybridFlow: A Flexible and Efficient RLHF Framework},
  author  = {Guangming Sheng and Chi Zhang and Zilingfeng Ye and Xibin Wu and Wang Zhang and Ru Zhang and Yanghua Peng and Haibin Lin and Chuan Wu},
  year    = {2024},
  journal = {arXiv preprint arXiv: 2409.19256}
}
@article{hu2024openrlhf,
  title={OpenRLHF: An Easy-to-use, Scalable and High-performance RLHF Framework},
  author={Jian Hu and Xibin Wu and Zilin Zhu and Xianyu and Weixun Wang and Dehao Zhang and Yu Cao},
  journal={arXiv preprint arXiv:2405.11143},
  year={2024}
}
# end of inference time scaling
# RLHF More ########################################################################
@article{lee2023rlaif,
  title={Rlaif: Scaling reinforcement learning from human feedback with ai feedback},
  author={Lee, Harrison and Phatale, Samrat and Mansoor, Hassan and Lu, Kellie Ren and Mesnard, Thomas and Ferret, Johan and Bishop, Colton and Hall, Ethan and Carbune, Victor and Rastogi, Abhinav},
  year={2023}
}
@misc{huang2024putting,
  author       = {Shengyi Costa Huang and Arash Ahmadian and Cohere For AI},
  title        = {Putting RL back in RLHF},
  year         = {2024},
  howpublished = {\url{https://huggingface.co/blog/putting_rl_back_in_rlhf_with_rloo}},
  note         = {Accessed: 2025-01-15}
}
@article{wu2024fine,
  title={Fine-grained human feedback gives better rewards for language model training},
  author={Wu, Zeqiu and Hu, Yushi and Shi, Weijia and Dziri, Nouha and Suhr, Alane and Ammanabrolu, Prithviraj and Smith, Noah A and Ostendorf, Mari and Hajishirzi, Hannaneh},
  journal={Advances in Neural Information Processing Systems},
  volume={36},
  year={2024}
}
@article{kirk2023understanding,
  title={Understanding the effects of rlhf on llm generalisation and diversity},
  author={Kirk, Robert and Mediratta, Ishita and Nalmpantis, Christoforos and Luketina, Jelena and Hambro, Eric and Grefenstette, Edward and Raileanu, Roberta},
  journal={arXiv preprint arXiv:2310.06452},
  year={2023}
}
@article{chu2025sft,
  title={Sft memorizes, rl generalizes: A comparative study of foundation model post-training},
  author={Chu, Tianzhe and Zhai, Yuexiang and Yang, Jihan and Tong, Shengbang and Xie, Saining and Schuurmans, Dale and Le, Quoc V and Levine, Sergey and Ma, Yi},
  journal={arXiv preprint arXiv:2501.17161},
  year={2025}
}
@article{chen2024learning,
  title={Learning from Natural Language Feedback},
  author={Chen, Angelica and Scheurer, J{\'e}r{\'e}my and Campos, Jon Ander and Korbak, Tomasz and Chan, Jun Shern and Bowman, Samuel R and Cho, Kyunghyun and Perez, Ethan},
  journal={Transactions on Machine Learning Research},
  year={2024}
}
@article{ahmadian2024back,
  title={Back to basics: Revisiting reinforce style optimization for learning from human feedback in llms},
  author={Ahmadian, Arash and Cremer, Chris and Gall{\'e}, Matthias and Fadaee, Marzieh and Kreutzer, Julia and Pietquin, Olivier and {\"U}st{\"u}n, Ahmet and Hooker, Sara},
  journal={arXiv preprint arXiv:2402.14740},
  year={2024}
}

@article{pang2024iterative,
  title={Iterative reasoning preference optimization},
  author={Pang, Richard Yuanzhe and Yuan, Weizhe and Cho, Kyunghyun and He, He and Sukhbaatar, Sainbayar and Weston, Jason},
  journal={arXiv preprint arXiv:2404.19733},
  year={2024}
}
@article{cohen2022dynamic,
  title={Dynamic planning in open-ended dialogue using reinforcement learning},
  author={Cohen, Deborah and Ryu, Moonkyung and Chow, Yinlam and Keller, Orgad and Greenberg, Ido and Hassidim, Avinatan and Fink, Michael and Matias, Yossi and Szpektor, Idan and Boutilier, Craig and others},
  journal={arXiv preprint arXiv:2208.02294},
  year={2022}
}
@article{ramamurthy2022reinforcement,
  title={Is reinforcement learning (not) for natural language processing: Benchmarks, baselines, and building blocks for natural language policy optimization},
  author={Ramamurthy, Rajkumar and Ammanabrolu, Prithviraj and Brantley, Kiant{\'e} and Hessel, Jack and Sifa, Rafet and Bauckhage, Christian and Hajishirzi, Hannaneh and Choi, Yejin},
  journal={arXiv preprint arXiv:2210.01241},
  year={2022}
}

@article{gao2024rebel,
  title={Rebel: Reinforcement learning via regressing relative rewards},
  author={Gao, Zhaolin and Chang, Jonathan D and Zhan, Wenhao and Oertell, Owen and Swamy, Gokul and Brantley, Kiant{\'e} and Joachims, Thorsten and Bagnell, J Andrew and Lee, Jason D and Sun, Wen},
  journal={arXiv preprint arXiv:2404.16767},
  year={2024}
}
% overopt
@article{zhang2018study,
  title={A study on overfitting in deep reinforcement learning},
  author={Zhang, Chiyuan and Vinyals, Oriol and Munos, Remi and Bengio, Samy},
  journal={arXiv preprint arXiv:1804.06893},
  year={2018}
}
@article{hoskin1996awful,
  title={The ‘awful idea of accountability’: inscribing people into the measurement of objects},
  author={Hoskin, Keith},
  journal={Accountability: Power, ethos and the technologies of managing},
  volume={265},
  year={1996},
  publisher={International Thomson Business Press London}
}
@article{coste2023reward,
  title={Reward model ensembles help mitigate overoptimization},
  author={Coste, Thomas and Anwar, Usman and Kirk, Robert and Krueger, David},
  journal={arXiv preprint arXiv:2310.02743},
  year={2023}
}
@article{moskovitz2023confronting,
  title={Confronting reward model overoptimization with constrained RLHF},
  author={Moskovitz, Ted and Singh, Aaditya K and Strouse, DJ and Sandholm, Tuomas and Salakhutdinov, Ruslan and Dragan, Anca D and McAleer, Stephen},
  journal={arXiv preprint arXiv:2310.04373},
  year={2023}
}
@article{rafailov2024scaling,
  title={Scaling laws for reward model overoptimization in direct alignment algorithms},
  author={Rafailov, Rafael and Chittepu, Yaswanth and Park, Ryan and Sikchi, Harshit Sushil and Hejna, Joey and Knox, Brad and Finn, Chelsea and Niekum, Scott},
  journal={Advances in Neural Information Processing Systems},
  volume={37},
  pages={126207--126242},
  year={2024}
}
@article{rottger2023xstest,
  title={Xstest: A test suite for identifying exaggerated safety behaviours in large language models},
  author={R{\"o}ttger, Paul and Kirk, Hannah Rose and Vidgen, Bertie and Attanasio, Giuseppe and Bianchi, Federico and Hovy, Dirk},
  journal={arXiv preprint arXiv:2308.01263},
  year={2023}
}
@article{han2024wildguard,
  title={Wildguard: Open one-stop moderation tools for safety risks, jailbreaks, and refusals of llms},
  author={Han, Seungju and Rao, Kavel and Ettinger, Allyson and Jiang, Liwei and Lin, Bill Yuchen and Lambert, Nathan and Choi, Yejin and Dziri, Nouha},
  journal={arXiv preprint arXiv:2406.18495},
  year={2024}
}
@article{inan2023llama,
  title={Llama guard: Llm-based input-output safeguard for human-ai conversations},
  author={Inan, Hakan and Upasani, Kartikeya and Chi, Jianfeng and Rungta, Rashi and Iyer, Krithika and Mao, Yuning and Tontchev, Michael and Hu, Qing and Fuller, Brian and Testuggine, Davide and others},
  journal={arXiv preprint arXiv:2312.06674},
  year={2023}
}
% dpo methods
@inproceedings{azar2024general,
  title={A general theoretical paradigm to understand learning from human preferences},
  author={Azar, Mohammad Gheshlaghi and Guo, Zhaohan Daniel and Piot, Bilal and Munos, Remi and Rowland, Mark and Valko, Michal and Calandriello, Daniele},
  booktitle={International Conference on Artificial Intelligence and Statistics},
  pages={4447--4455},
  year={2024},
  organization={PMLR}
}
@article{zhao2023slic,
  title={Slic-hf: Sequence likelihood calibration with human feedback},
  author={Zhao, Yao and Joshi, Rishabh and Liu, Tianqi and Khalman, Misha and Saleh, Mohammad and Liu, Peter J},
  journal={arXiv preprint arXiv:2305.10425},
  year={2023}
}
@article{singhal2024d2po,
  title={D2po: Discriminator-guided dpo with response evaluation models},
  author={Singhal, Prasann and Lambert, Nathan and Niekum, Scott and Goyal, Tanya and Durrett, Greg},
  journal={arXiv preprint arXiv:2405.01511},
  year={2024}
}
@article{guo2024direct,
  title={Direct language model alignment from online ai feedback},
  author={Guo, Shangmin and Zhang, Biao and Liu, Tianlin and Liu, Tianqi and Khalman, Misha and Llinares, Felipe and Rame, Alexandre and Mesnard, Thomas and Zhao, Yao and Piot, Bilal and others},
  journal={arXiv preprint arXiv:2402.04792},
  year={2024}
}
@article{amini2024direct,
  title={Direct preference optimization with an offset},
  author={Amini, Afra and Vieira, Tim and Cotterell, Ryan},
  journal={arXiv preprint arXiv:2402.10571},
  year={2024}
}
@article{hong2024reference,
  title={Reference-free monolithic preference optimization with odds ratio},
  author={Hong, Jiwoo and Lee, Noah and Thorne, James},
  journal={arXiv e-prints},
  pages={arXiv--2403},
  year={2024}
}
@article{meng2025simpo,
  title={Simpo: Simple preference optimization with a reference-free reward},
  author={Meng, Yu and Xia, Mengzhou and Chen, Danqi},
  journal={Advances in Neural Information Processing Systems},
  volume={37},
  pages={124198--124235},
  year={2025}
}
@article{zhao2024rainbowpo,
  title={Rainbowpo: A unified framework for combining improvements in preference optimization},
  author={Zhao, Hanyang and Winata, Genta Indra and Das, Anirban and Zhang, Shi-Xiong and Yao, David D and Tang, Wenpin and Sahu, Sambit},
  journal={arXiv preprint arXiv:2410.04203},
  year={2024}
}
@article{rosset2024direct,
  title={Direct nash optimization: Teaching language models to self-improve with general preferences},
  author={Rosset, Corby and Cheng, Ching-An and Mitra, Arindam and Santacroce, Michael and Awadallah, Ahmed and Xie, Tengyang},
  journal={arXiv preprint arXiv:2404.03715},
  year={2024}
}
@article{jung2024binary,
  title={Binary classifier optimization for large language model alignment},
  author={Jung, Seungjae and Han, Gunsoo and Nam, Daniel Wontae and On, Kyoung-Woon},
  journal={arXiv preprint arXiv:2404.04656},
  year={2024}
}
@article{gorbatovski2025differences,
  title={The Differences Between Direct Alignment Algorithms are a Blur},
  author={Gorbatovski, Alexey and Shaposhnikov, Boris and Sinii, Viacheslav and Malakhov, Alexey and Gavrilov, Daniil},
  journal={arXiv preprint arXiv:2502.01237},
  year={2025}
}
@article{razin2024unintentional,
  title={Unintentional unalignment: Likelihood displacement in direct preference optimization},
  author={Razin, Noam and Malladi, Sadhika and Bhaskar, Adithya and Chen, Danqi and Arora, Sanjeev and Hanin, Boris},
  journal={arXiv preprint arXiv:2410.08847},
  year={2024}
}
@article{ren2024learning,
  title={Learning dynamics of llm finetuning},
  author={Ren, Yi and Sutherland, Danica J},
  journal={arXiv preprint arXiv:2407.10490},
  year={2024}
}
@article{xiao2024cal,
  title={Cal-dpo: Calibrated direct preference optimization for language model alignment},
  author={Xiao, Teng and Yuan, Yige and Zhu, Huaisheng and Li, Mingxiao and Honavar, Vasant G},
  journal={arXiv preprint arXiv:2412.14516},
  year={2024}
}
@article{gupta2025alphapo,
  title={AlphaPO--Reward shape matters for LLM alignment},
  author={Gupta, Aman and Tang, Shao and Song, Qingquan and Zhu, Sirou and Hong, Jiwoo and Saha, Ankan and Gupta, Viral and Lee, Noah and Kim, Eunki and Zhu, Siyu and others},
  journal={arXiv preprint arXiv:2501.03884},
  year={2025}
}
@article{park2024disentangling,
  title={Disentangling length from quality in direct preference optimization},
  author={Park, Ryan and Rafailov, Rafael and Ermon, Stefano and Finn, Chelsea},
  journal={arXiv preprint arXiv:2403.19159},
  year={2024}
}
% end dpo methods
@article{singhal2023long,
  title={A long way to go: Investigating length correlations in rlhf},
  author={Singhal, Prasann and Goyal, Tanya and Xu, Jiacheng and Durrett, Greg},
  journal={arXiv preprint arXiv:2310.03716},
  year={2023}
}
@article{miranda2024hybrid,
  title={Hybrid Preferences: Learning to Route Instances for Human vs. AI Feedback},
  author={Miranda, Lester James V and Wang, Yizhong and Elazar, Yanai and Kumar, Sachin and Pyatkin, Valentina and Brahman, Faeze and Smith, Noah A and Hajishirzi, Hannaneh and Dasigi, Pradeep},
  journal={arXiv preprint arXiv:2410.19133},
  year={2024}
}
@article{tajwar2024preference,
  title={Preference fine-tuning of llms should leverage suboptimal, on-policy data},
  author={Tajwar, Fahim and Singh, Anikait and Sharma, Archit and Rafailov, Rafael and Schneider, Jeff and Xie, Tengyang and Ermon, Stefano and Finn, Chelsea and Kumar, Aviral},
  journal={arXiv preprint arXiv:2404.14367},
  year={2024}
}
@article{casper2023open,
  title={Open problems and fundamental limitations of reinforcement learning from human feedback},
  author={Casper, Stephen and Davies, Xander and Shi, Claudia and Gilbert, Thomas Krendl and Scheurer, J{\'e}r{\'e}my and Rando, Javier and Freedman, Rachel and Korbak, Tomasz and Lindner, David and Freire, Pedro and others},
  journal={arXiv preprint arXiv:2307.15217},
  year={2023}
}
@article{wallace2024instruction,
  title={The instruction hierarchy: Training llms to prioritize privileged instructions},
  author={Wallace, Eric and Xiao, Kai and Leike, Reimar and Weng, Lilian and Heidecke, Johannes and Beutel, Alex},
  journal={arXiv preprint arXiv:2404.13208},
  year={2024}
}
@article{kumar2024training,
  title={Training language models to self-correct via reinforcement learning},
  author={Kumar, Aviral and Zhuang, Vincent and Agarwal, Rishabh and Su, Yi and Co-Reyes, John D and Singh, Avi and Baumli, Kate and Iqbal, Shariq and Bishop, Colton and Roelofs, Rebecca and others},
  journal={arXiv preprint arXiv:2409.12917},
  year={2024}
}
@article{singh2023beyond,
  title={Beyond human data: Scaling self-training for problem-solving with language models},
  author={Singh, Avi and Co-Reyes, John D and Agarwal, Rishabh and Anand, Ankesh and Patil, Piyush and Liu, Peter J and Harrison, James and Lee, Jaehoon and Xu, Kelvin and Parisi, Aaron and others},
  journal={arXiv preprint arXiv:2312.06585},
  year={2023}
}
@misc{openai2024o1,
  title        = {Introducing OpenAI o1-preview},
  author       = {{OpenAI}},
  year         = {2024},
  month        = sep,
  url          = {https://openai.com/index/introducing-openai-o1-preview/},
  note         = {Accessed: 2024-10-18}
}
@article{cobbe2021training,
  title={Training verifiers to solve math word problems},
  author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and others},
  journal={arXiv preprint arXiv:2110.14168},
  year={2021}
}
@article{lightman2023let,
  title={Let's verify step by step},
  author={Lightman, Hunter and Kosaraju, Vineet and Burda, Yura and Edwards, Harri and Baker, Bowen and Lee, Teddy and Leike, Jan and Schulman, John and Sutskever, Ilya and Cobbe, Karl},
  journal={arXiv preprint arXiv:2305.20050},
  year={2023}
}
@inproceedings{lu2011learning,
  title={Learning Mallows models with pairwise preferences},
  author={Lu, Tyler and Boutilier, Craig},
  booktitle={Proceedings of the 28th international conference on machine learning (icml-11)},
  pages={145--152},
  year={2011}
}
@inproceedings{liu2019learning,
  title={Learning plackett-luce mixtures from partial preferences},
  author={Liu, Ao and Zhao, Zhibing and Liao, Chao and Lu, Pinyan and Xia, Lirong},
  booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
  volume={33},
  number={01},
  pages={4328--4335},
  year={2019}
}
@inproceedings{zhu2024starling,
  title={Starling-7b: Improving helpfulness and harmlessness with rlaif},
  author={Zhu, Banghua and Frick, Evan and Wu, Tianhao and Zhu, Hanlin and Ganesan, Karthik and Chiang, Wei-Lin and Zhang, Jian and Jiao, Jiantao},
  booktitle={First Conference on Language Modeling},
  year={2024}
}
@article{wang2023openchat,
  title={Openchat: Advancing open-source language models with mixed-quality data},
  author={Wang, Guan and Cheng, Sijie and Zhan, Xianyuan and Li, Xiangang and Song, Sen and Liu, Yang},
  journal={arXiv preprint arXiv:2309.11235},
  year={2023}
}
@article{chiang2024chatbot,
  title={Chatbot arena: An open platform for evaluating llms by human preference},
  author={Chiang, Wei-Lin and Zheng, Lianmin and Sheng, Ying and Angelopoulos, Anastasios Nikolas and Li, Tianle and Li, Dacheng and Zhang, Hao and Zhu, Banghua and Jordan, Michael and Gonzalez, Joseph E and others},
  journal={arXiv preprint arXiv:2403.04132},
  year={2024}
}
@article{zhou2023instruction,
  title={Instruction-following evaluation for large language models},
  author={Zhou, Jeffrey and Lu, Tianjian and Mishra, Swaroop and Brahma, Siddhartha and Basu, Sujoy and Luan, Yi and Zhou, Denny and Hou, Le},
  journal={arXiv preprint arXiv:2311.07911},
  year={2023}
}
@article{ethayarajh2024kto,
  title={Kto: Model alignment as prospect theoretic optimization},
  author={Ethayarajh, Kawin and Xu, Winnie and Muennighoff, Niklas and Jurafsky, Dan and Kiela, Douwe},
  journal={arXiv preprint arXiv:2402.01306},
  year={2024}
}
@article{lyu2025exploring,
  title={Exploring the Limit of Outcome Reward for Learning Mathematical Reasoning},
  author={Lyu, Chengqi and Gao, Songyang and Gu, Yuzhe and Zhang, Wenwei and Gao, Jianfei and Liu, Kuikun and Wang, Ziyi and Li, Shuaibin and Zhao, Qian and Huang, Haian and others},
  journal={arXiv preprint arXiv:2502.06781},
  year={2025}
}
@misc{openai2024modelspec,
  title        = {Introducing the Model Spec},
  author       = {{OpenAI}},
  year         = 2024,
  month        = may,
  url          = {https://openai.com/index/introducing-the-model-spec/},
  note         = {Accessed: 2025-02-14}
}
@article{guan2024deliberative,
  title={Deliberative alignment: Reasoning enables safer language models},
  author={Guan, Melody Y and Joglekar, Manas and Wallace, Eric and Jain, Saachi and Barak, Boaz and Heylar, Alec and Dias, Rachel and Vallone, Andrea and Ren, Hongyu and Wei, Jason and others},
  journal={arXiv preprint arXiv:2412.16339},
  year={2024}
}
# LLM as a Judge ####################################################################
@article{zheng2023judging,
  title={Judging llm-as-a-judge with mt-bench and chatbot arena},
  author={Zheng, Lianmin and Chiang, Wei-Lin and Sheng, Ying and Zhuang, Siyuan and Wu, Zhanghao and Zhuang, Yonghao and Lin, Zi and Li, Zhuohan and Li, Dacheng and Xing, Eric and others},
  journal={Advances in Neural Information Processing Systems},
  volume={36},
  pages={46595--46623},
  year={2023}
}
@article{lin2024wildbench,
  title={WILDBENCH: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild},
  author={Lin, Bill Yuchen and Deng, Yuntian and Chandu, Khyathi and Brahman, Faeze and Ravichander, Abhilasha and Pyatkin, Valentina and Dziri, Nouha and Bras, Ronan Le and Choi, Yejin},
  journal={arXiv preprint arXiv:2406.04770},
  year={2024}
}
@article{li2024crowdsourced,
  title={From Crowdsourced Data to High-Quality Benchmarks: Arena-Hard and BenchBuilder Pipeline},
  author={Li, Tianle and Chiang, Wei-Lin and Frick, Evan and Dunlap, Lisa and Wu, Tianhao and Zhu, Banghua and Gonzalez, Joseph E and Stoica, Ion},
  journal={arXiv preprint arXiv:2406.11939},
  year={2024}
}
@article{dubois2024length,
  title={Length-controlled alpacaeval: A simple way to debias automatic evaluators},
  author={Dubois, Yann and Galambosi, Bal{\'a}zs and Liang, Percy and Hashimoto, Tatsunori B},
  journal={arXiv preprint arXiv:2404.04475},
  year={2024}
}
@misc{ganguli2023,
  title        = {Collective Constitutional {AI}: {A}ligning a Language Model with Public Input},
  author       = {Ganguli, D. and others},
  year         = 2023,
  howpublished = {Anthropic},
  note         = {\url{https://www.anthropic.com/index/collective-constitutional-ai-aligning-a-language-model-with-public-input}, retrieved 2024-01-31}
}
@online{Anthropic2023ClaudesConstitution,
  author = {Anthropic},
  title = {Claude’s Constitution},
  year = {2023},
  url = {https://www.anthropic.com/news/claudes-constitution},
  note = {Accessed: 2024-02-07},
  urldate = {2024-02-07}
}
@misc{anthropic2024claude,
  author       = {{Anthropic}},
  title        = {Claude’s Character},
  year         = {2024},
  month        = {June},
  url          = {https://www.anthropic.com/research/claude-character},
  note         = {Accessed: 2025-04-16}
}
@article{lambert2024self,
  title={Self-directed synthetic dialogues and revisions technical report},
  author={Lambert, Nathan and Schoelkopf, Hailey and Gokaslan, Aaron and Soldaini, Luca and Pyatkin, Valentina and Castricato, Louis},
  journal={arXiv preprint arXiv:2407.18421},
  year={2024}
}
@inproceedings{sun2023principledriven,
title={Principle-Driven Self-Alignment of Language Models from Scratch with Minimal Human Supervision},
author={Zhiqing Sun and Yikang Shen and Qinhong Zhou and Hongxin Zhang and Zhenfang Chen and David Daniel Cox and Yiming Yang and Chuang Gan},
booktitle={Thirty-seventh Conference on Neural Information Processing Systems},
year={2023},
url={https://openreview.net/forum?id=p40XRfBX96}
}
@inproceedings{sun2024salmon,
title={{SALMON}: Self-Alignment with Principle-Following Reward Models},
author={Sun, Zhiqing and Shen, Yikang and Zhang, Hongxin and Zhou, Qinhong and Chen, Zhenfang and Cox, David and Yang, Yiming and Gan, Chuang},
booktitle={The Twelfth International Conference on Learning Representations},
year={2024},
url={https://openreview.net/forum?id=xJbsmB8UMx}
}
@article{Huang2024cai,
  author = {Huang, Shengyi and Tunstall, Lewis and Beeching, Edward and von Werra, Leandro and Sanseviero, Omar and Rasul, Kashif and Wolf, Thomas},
  title = {Constitutional AI Recipe},
  journal = {Hugging Face Blog},
  year = {2024},
  note = {\url{https://huggingface.co/blog/constitutional_ai}},
}

@article{wang2023shepherd,
  title={Shepherd: A critic for language model generation},
  author={Wang, Tianlu and Yu, Ping and Tan, Xiaoqing Ellen and O'Brien, Sean and Pasunuru, Ramakanth and Dwivedi-Yu, Jane and Golovneva, Olga and Zettlemoyer, Luke and Fazel-Zarandi, Maryam and Celikyilmaz, Asli},
  journal={arXiv preprint arXiv:2308.04592},
  year={2023}
}

@inproceedings{kim2023prometheus,
  title={Prometheus: Inducing fine-grained evaluation capability in language models},
  author={Kim, Seungone and Shin, Jamin and Cho, Yejin and Jang, Joel and Longpre, Shayne and Lee, Hwaran and Yun, Sangdoo and Shin, Seongjin and Kim, Sungdong and Thorne, James and others},
  booktitle={The Twelfth International Conference on Learning Representations},
  year={2023}
}
@article{kim2024prometheus,
  title={Prometheus 2: An open source language model specialized in evaluating other language models},
  author={Kim, Seungone and Suk, Juyoung and Longpre, Shayne and Lin, Bill Yuchen and Shin, Jamin and Welleck, Sean and Neubig, Graham and Lee, Moontae and Lee, Kyungjae and Seo, Minjoon},
  journal={arXiv preprint arXiv:2405.01535},
  year={2024}
}
@inproceedings{lee2024prometheus,
  title={Prometheus-vision: Vision-language model as a judge for fine-grained evaluation},
  author={Lee, Seongyun and Kim, Seungone and Park, Sue and Kim, Geewook and Seo, Minjoon},
  booktitle={Findings of the Association for Computational Linguistics ACL 2024},
  pages={11286--11315},
  year={2024}
}
@article{ke2023critiquellm,
  title={CritiqueLLM: Towards an informative critique generation model for evaluation of large language model generation},
  author={Ke, Pei and Wen, Bosi and Feng, Zhuoer and Liu, Xiao and Lei, Xuanyu and Cheng, Jiale and Wang, Shengyuan and Zeng, Aohan and Dong, Yuxiao and Wang, Hongning and others},
  journal={arXiv preprint arXiv:2311.18702},
  year={2023}
}
@article{li2023generative,
  title={Generative Judge for Evaluating Alignment},
  author={Li, Junlong and Sun, Shichao and Yuan, Weizhe and Fan, Run-Ze and Zhao, Hai and Liu, Pengfei},
  journal={arXiv preprint arXiv:2310.05470},
  year={2023}
}
@article{huang2024empirical,
  title={An empirical study of llm-as-a-judge for llm evaluation: Fine-tuned judge models are task-specific classifiers},
  author={Huang, Hui and Qu, Yingqi and Liu, Jing and Yang, Muyun and Zhao, Tiejun},
  journal={arXiv preprint arXiv:2403.02839},
  year={2024}
}
# More evals
@article{singh2024evaluation,
  title={Evaluation data contamination in LLMs: how do we measure it and (when) does it matter?},
  author={Singh, Aaditya K and Kocyigit, Muhammed Yusuf and Poulton, Andrew and Esiobu, David and Lomeli, Maria and Szilvasy, Gergely and Hupkes, Dieuwke},
  journal={arXiv preprint arXiv:2411.03923},
  year={2024}
}
@article{huang2025math,
  title={MATH-Perturb: Benchmarking LLMs' Math Reasoning Abilities against Hard Perturbations},
  author={Huang, Kaixuan and Guo, Jiacheng and Li, Zihao and Ji, Xiang and Ge, Jiawei and Li, Wenzhe and Guo, Yingqing and Cai, Tianle and Yuan, Hui and Wang, Runzhe and others},
  journal={arXiv preprint arXiv:2502.06453},
  year={2025}
}
@article{hendrycks2020measuring,
  title={Measuring massive multitask language understanding},
  author={Hendrycks, Dan and Burns, Collin and Basart, Steven and Zou, Andy and Mazeika, Mantas and Song, Dawn and Steinhardt, Jacob},
  journal={arXiv preprint arXiv:2009.03300},
  year={2020}
}
@article{mallen2023llm_memorization,
  title={When Not to Trust Language Models: Investigating Effectiveness and Limitations of Parametric and Non-Parametric Memories },
  author={ Mallen, Alex and Asai,Akari and  Zhong, Victor and Das, Rajarshi and Hajishirzi, Hannaneh and Khashabi, Daniel},
  journal={ arXiv preprint },
  year={ 2022 }
}
@article{lin2021truthfulqa,
  title={Truthfulqa: Measuring how models mimic human falsehoods},
  author={Lin, Stephanie and Hilton, Jacob and Evans, Owain},
  journal={arXiv preprint arXiv:2109.07958},
  year={2021}
}
@article{chen2021codex,
  title={Evaluating Large Language Models Trained on Code},
  author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
  year={2021},
  eprint={2107.03374},
  archivePrefix={arXiv},
  primaryClass={cs.LG}
}
@inproceedings{evalplus,
  title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
  author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
  booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
  year = {2023},
  url = {https://openreview.net/forum?id=1qvx610Cu7},
}
@article{cobbe2021gsm8k,
  title={Training Verifiers to Solve Math Word Problems},
  author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
  journal={arXiv preprint arXiv:2110.14168},
  year={2021}
}
@article{hendrycksmath2021,
  title={Measuring Mathematical Problem Solving With the MATH Dataset},
  author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
  journal={NeurIPS},
  year={2021}
}
@article{suzgun2022challenging,
  title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},
  author={Suzgun, Mirac and Scales, Nathan and Sch{\"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and and Wei, Jason},
  journal={arXiv preprint arXiv:2210.09261},
  year={2022}
}
@article{dua2019drop,
  title={DROP: A reading comprehension benchmark requiring discrete reasoning over paragraphs},
  author={Dua, Dheeru and Wang, Yizhong and Dasigi, Pradeep and Stanovsky, Gabriel and Singh, Sameer and Gardner, Matt},
  journal={arXiv preprint arXiv:1903.00161},
  year={2019}
}
@misc{zhou2023instructionfollowingevaluationlargelanguage,
      title={Instruction-Following Evaluation for Large Language Models},
      author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
      year={2023},
      eprint={2311.07911},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2311.07911},
}
@misc{scale2024seal,
  author       = {Scale AI},
  title        = {SEAL LLM Leaderboards: Expert-Driven Private Evaluations},
  year         = {2024},
  url          = {https://scale.com/leaderboard},
  note         = {Accessed: 2025-04-10}
}
@article{rein2023gpqa,
  title={GPQA: A Graduate-Level Google-Proof Q\&A Benchmark},
  author={Rein, David and Hou, Betty Li and Stickland, Asa Cooper and Petty, Jackson and Pang, Richard Yuanzhe and Dirani, Julien and Michael, Julian and Bowman, Samuel R.},
  journal={arXiv preprint arXiv:2311.12022},
  year={2023}
}

@article{phan2025hle,
  title={Humanity's Last Exam},
  author={Phan, Long and Gatti, Alice and Han, Ziwen and Li, Nathaniel and Zhang, Hugh et al.},
  journal={arXiv preprint arXiv:2501.14249},
  year={2025}
}

@article{aleithan2024swebenchplus,
  title={{SWE-Bench+}: Enhanced Coding Benchmark for LLMs},
  author={Aleithan, Reem and Xue, Haoran and Mohajer, Mohammad M. and Nnorom, Elijah and Uddin, Gias and Wang, Song},
  journal={arXiv preprint arXiv:2410.06992},
  year={2024}
}

@article{jain2024livecodebench,
  title={{LiveCodeBench}: Holistic and Contamination-Free Evaluation of Large Language Models for Code},
  author={Jain, Naman and Han, King and Gu, Alex and Li, Wen-Ding and Yan, Fanjia and Zhang, Tianjun and Wang, Sida and Solar-Lezama, Armando and Sen, Koushik and Stoica, Ion},
  journal={arXiv preprint arXiv:2403.07974},
  year={2024}
}
@inproceedings{robinson2023leveraging,
  title={Leveraging Large Language Models for Multiple Choice Question Answering},
  author={Robinson, Joshua and Rytting, Christopher Michael and Wingate, David},
  booktitle={International Conference on Learning Representations},
  year={2023},
  url={https://openreview.net/forum?id=upQ4o-ygvJ}
}
@inproceedings{wei2022finetuned,
  title={Finetuned Language Models Are Zero-Shot Learners},
  author={Wei, Jason and Bosma, Maarten and Zhao, Vincent Y and Guu, Kelvin and Yu, Adams Wei and Lester, Brian and Du, Nan and Dai, Andrew M and Le, Quoc V},
  booktitle={International Conference on Learning Representations},
  year={2022}
}
@inproceedings{sanh2022multitask,
  title={Multitask Prompted Training Enables Zero-Shot Task Generalization},
  author={Sanh, Victor and Webson, Albert and Raffel, Colin and Bach, Stephen H and Sutawika, Lintang and Alyafeai, Zaid and Chaffin, Antoine and Stiegler, Arnaud and Le Scao, Teven and Raja, Arun and others},
  booktitle={International Conference on Learning Representations},
  year={2022}
}
@article{schulhoff2024prompt,
  title={The prompt report: A systematic survey of prompting techniques},
  author={Schulhoff, Sander and Ilie, Michael and Balepur, Nishant and Kahadze, Konstantine and Liu, Amanda and Si, Chenglei and Li, Yinheng and Gupta, Aayush and Han, HyoJung and Schulhoff, Sevien and others},
  journal={arXiv preprint arXiv:2406.06608},
  year={2024}
}
@article{kojima2022large,
  title={Large language models are zero-shot reasoners},
  author={Kojima, Takeshi and Gu, Shixiang Shane and Reid, Machel and Matsuo, Yutaka and Iwasawa, Yusuke},
  journal={Advances in neural information processing systems},
  volume={35},
  pages={22199--22213},
  year={2022}
}
@misc{openai2024swebench,
  title        = {Introducing SWE-bench Verified},
  author       = {{OpenAI}},
  year         = {2024},
  month        = aug,
  url          = {https://openai.com/index/introducing-swe-bench-verified/},
  note         = {Accessed: 2025-04-12}
}
# eval tools
@misc{inspectAI2024,
  author       = {{UK AI Safety Institute}},
  title        = {{Inspect AI: Framework for Large Language Model Evaluations}},
  year         = {2024},
  howpublished = {\url{https://github.com/UKGovernmentBEIS/inspect_ai}}
}
@misc{open-llm-leaderboard-v2,
  author = {Clémentine Fourrier and Nathan Habib and Alina Lozovskaya and Konrad Szafer and Thomas Wolf},
  title = {Open LLM Leaderboard v2},
  year = {2024},
  publisher = {Hugging Face},
  howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}",
}
@misc{fourrier2023lighteval,
  author       = {Fourrier, Clementine and Habib, Nathan and Kydlicek, Hynek and Wolf, Thomas and Tunstall, Lewis},
  title        = {{LightEval: A lightweight framework for LLM evaluation}},
  year         = {2023},
  howpublished = {\url{https://github.com/huggingface/lighteval}}
}
@misc{gao2023evalharness,
  author       = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
  title        = {{A Framework for Few-Shot Language Model Evaluation}},
  year         = {2023},
  publisher    = {Zenodo},
  doi          = {10.5281/zenodo.10256836},
  url          = {https://zenodo.org/record/10256836}
}
@article{gu2024olmes,
  author    = {Gu, Yuling and Tafjord, Oyvind and Kuehl, Bailey and Haddad, Dany and Dodge, Jesse and Hajishirzi, Hannaneh},
  title     = {{OLMES: A Standard for Language Model Evaluations}},
  journal   = {arXiv preprint arXiv:2406.08446},
  year      = {2024}
}
@article{liang2023helm,
  author    = {Liang, Percy and Bommasani, Rishi and Lee, Tony and Tsipras, Dimitris and Soylu, Dilara and Yasunaga, Michihiro and Zhang, Yian and Narayanan, Deepak and Wu, Yuhuai and Kumar, Ananya and Newman, Benjamin and Yuan, Binhang and Yan, Bobby and Zhang, Ce and Cosgrove, Christian and Manning, Christopher D. and R\'e, Christopher and Acosta-Navas, Diana and Hudson, Drew A. and Zelikman, Eric and Durmus, Esin and Ladhak, Faisal and Rong, Frieda and Ren, Hongyu and Yao, Huaxiu and Wang, Jue and Santhanam, Keshav and Orr, Laurel J. and Zheng, Lucia and Y\'uksekg\"on\"ul, Mert and Suzgun, Mirac and Kim, Nathan and Guha, Neel and Chatterji, Niladri S. and Khattab, Omar and Henderson, Peter and Huang, Qian and Chi, Ryan and Xie, Sang Michael and Santurkar, Shibani and Ganguli, Surya and Hashimoto, Tatsunori and Icard, Thomas and Zhang, Tianyi and Chaudhary, Vishrav and Wang, William and Li, Xuechen and Mai, Yifan and Zhang, Yuhui and Koreeda, Yuta},
  title     = {{Holistic Evaluation of Language Models}},
  journal   = {Transactions on Machine Learning Research},
  year      = {2023},
  doi       = {10.1111/nyas.15007},
  note      = {Also available as arXiv:2211.09110}
}
@misc{mosaicml2024gauntlet,
  author       = {{MosaicML}},
  title        = {{Mosaic Eval Gauntlet v0.3.0 \textemdash{} Evaluation Suite}},
  year         = {2024},
  howpublished = {\url{https://github.com/mosaicml/llm-foundry/blob/main/scripts/eval/local_data/EVAL_GAUNTLET.md}},
  note         = {Accessed 2024-05-29}
}
@article{lewkowycz2022solving,
  title={Solving quantitative reasoning problems with language models},
  author={Lewkowycz, Aitor and Andreassen, Anders and Dohan, David and Dyer, Ethan and Michalewski, Henryk and Ramasesh, Vinay and Slone, Ambrose and Anil, Cem and Schlag, Imanol and Gutman-Solo, Theo and others},
  journal={Advances in Neural Information Processing Systems},
  volume={35},
  pages={3843--3857},
  year={2022}
}
# Misc Blogs / talks #############################################################################
@misc{schulman2016klapprox,
  author = {Schulman, John},
  title = {Approximating KL-divergence},
  year = {2016},
  howpublished = {\url{http://joschu.net/blog/kl-approx.html}},
  note = {Accessed: 2024-10-01}
}
@misc{openai2022chatgpt,
  title = {ChatGPT: Optimizing Language Models for Dialogue},
  author = {{OpenAI}},
  year = {2022},
  howpublished = {\url{https://openai.com/blog/chatgpt/}},
  note = {Training a LM with RLHF for suitable use as an all-purpose chat bot.}
}
@misc{schulman2023proxy,
  author       = {John Schulman},
  title        = {Proxy Objectives in Reinforcement Learning from Human Feedback},
  howpublished = {Invited talk at the International Conference on Machine Learning (ICML)},
  year         = 2023,
  url          = {https://icml.cc/virtual/2023/invited-talk/21549}
}
@book{goodhart1984problems,
  title={Problems of monetary management: the UK experience},
  author={Goodhart, Charles AE and Goodhart, CAE},
  year={1984},
  publisher={Springer}
}
@misc{rlblogpost,
    title={Deep Reinforcement Learning Doesn't Work Yet},
    author={Irpan, Alex},
    howpublished={\url{https://www.alexirpan.com/2018/02/14/rl-hard.html}},
    year={2018}
}

#Social choice #######################################################################################
@article{conitzer2024social,
  title={Social Choice Should Guide AI Alignment in Dealing with Diverse Human Feedback},
  author={Conitzer, Vincent and Freedman, Rachel and Heitzig, Jobst and Holliday, Wesley H and Jacobs, Bob M and Lambert, Nathan and Moss{\'e}, Milan and Pacuit, Eric and Russell, Stuart and Schoelkopf, Hailey and others},
  journal={arXiv preprint arXiv:2404.10271},
  year={2024}
}
@article{mishra2023ai,
  title={Ai alignment and social choice: Fundamental limitations and policy implications},
  author={Mishra, Abhilash},
  journal={arXiv preprint arXiv:2310.16048},
  year={2023}
}
@article{kirk2024prism,
  title={The PRISM Alignment Project: What Participatory, Representative and Individualised Human Feedback Reveals About the Subjective and Multicultural Alignment of Large Language Models},
  author={Kirk, Hannah Rose and Whitefield, Alexander and R{\"o}ttger, Paul and Bean, Andrew and Margatina, Katerina and Ciro, Juan and Mosquera, Rafael and Bartolo, Max and Williams, Adina and He, He and others},
  journal={arXiv preprint arXiv:2404.16019},
  year={2024}
}
@article{poddar2024personalizing,
  title={Personalizing reinforcement learning from human feedback with variational preference learning},
  author={Poddar, Sriyash and Wan, Yanming and Ivison, Hamish and Gupta, Abhishek and Jaques, Natasha},
  journal={arXiv preprint arXiv:2408.10075},
  year={2024}
}
# Models & More ########################################################################
@article{brown2020language,
  title={Language Models are Few-Shot Learners},
  author={Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
  journal={arXiv preprint arXiv:2005.14165},
  year={2020}
}
@article{qwen,
  title={Qwen Technical Report},
  author={Jinze Bai and Shuai Bai and Yunfei Chu and Zeyu Cui and Kai Dang and Xiaodong Deng and Yang Fan and Wenbin Ge and Yu Han and Fei Huang and Binyuan Hui and Luo Ji and Mei Li and Junyang Lin and Runji Lin and Dayiheng Liu and Gao Liu and Chengqiang Lu and Keming Lu and Jianxin Ma and Rui Men and Xingzhang Ren and Xuancheng Ren and Chuanqi Tan and Sinan Tan and Jianhong Tu and Peng Wang and Shijie Wang and Wei Wang and Shengguang Wu and Benfeng Xu and Jin Xu and An Yang and Hao Yang and Jian Yang and Shusheng Yang and Yang Yao and Bowen Yu and Hongyi Yuan and Zheng Yuan and Jianwei Zhang and Xingxuan Zhang and Yichang Zhang and Zhenru Zhang and Chang Zhou and Jingren Zhou and Xiaohuan Zhou and Tianhang Zhu},
  journal={arXiv preprint arXiv:2309.16609},
  year={2023}
}

@article{achiam2023gpt,
  title={Gpt-4 technical report},
  author={Achiam, Josh and Adler, Steven and Agarwal, Sandhini and Ahmad, Lama and Akkaya, Ilge and Aleman, Florencia Leoni and Almeida, Diogo and Altenschmidt, Janko and Altman, Sam and Anadkat, Shyamal and others},
  journal={arXiv preprint arXiv:2303.08774},
  year={2023}
}
@inproceedings{gpt-neox-20b,
  title={{GPT-NeoX-20B}: An Open-Source Autoregressive Language Model},
  author={Black, Sid and Biderman, Stella and Hallahan, Eric and Anthony, Quentin and Gao, Leo and Golding, Laurence and He, Horace and Leahy, Connor and McDonell, Kyle and Phang, Jason and Pieler, Michael and Prashanth, USVSN Sai and Purohit, Shivanshu and Reynolds, Laria and Tow, Jonathan and Wang, Ben and Weinbach, Samuel},
  booktitle={Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models},
  url={https://arxiv.org/abs/2204.06745},
  year={2022}
}
@misc{ai2_olmoe_ios_2025,
  author = {{Allen Institute for Artificial Intelligence}},
  title = {OLMoE, meet iOS},
  howpublished = {\url{https://allenai.org/blog/olmoe-app}},
  year = {2025},
  month = {February},
  day = {11}
}
@article{team2024gemma,
  title={Gemma 2: Improving open language models at a practical size},
  author={Team, Gemma and Riviere, Morgane and Pathak, Shreya and Sessa, Pier Giuseppe and Hardin, Cassidy and Bhupatiraju, Surya and Hussenot, L{\'e}onard and Mesnard, Thomas and Shahriari, Bobak and Ram{\'e}, Alexandre and others},
  journal={arXiv preprint arXiv:2408.00118},
  year={2024}
}
# ML other
@article{zhu2024distilling,
  title={Distilling mathematical reasoning capabilities into small language models},
  author={Zhu, Xunyu and Li, Jian and Liu, Yong and Ma, Can and Wang, Weiping},
  journal={Neural Networks},
  volume={179},
  pages={106594},
  year={2024},
  publisher={Elsevier}
}
@article{muennighoff2025s1,
  title={s1: Simple test-time scaling},
  author={Muennighoff, Niklas and Yang, Zitong and Shi, Weijia and Li, Xiang Lisa and Fei-Fei, Li and Hajishirzi, Hannaneh and Zettlemoyer, Luke and Liang, Percy and Cand{\`e}s, Emmanuel and Hashimoto, Tatsunori},
  journal={arXiv preprint arXiv:2501.19393},
  year={2025}
}
@article{shridhar2023distilling,
  title={Distilling reasoning capabilities into smaller language models},
  author={Shridhar, Kumar and Stolfo, Alessandro and Sachan, Mrinmaya},
  journal={Findings of the Association for Computational Linguistics: ACL 2023},
  pages={7059--7073},
  year={2023},
  publisher={Association for Computational Linguistics}
}
@article{hsieh2023distilling,
  title={Distilling step-by-step! outperforming larger language models with less training data and smaller model sizes},
  author={Hsieh, Cheng-Yu and Li, Chun-Liang and Yeh, Chih-Kuan and Nakhost, Hootan and Fujii, Yasuhisa and Ratner, Alexander and Krishna, Ranjay and Lee, Chen-Yu and Pfister, Tomas},
  journal={arXiv preprint arXiv:2305.02301},
  year={2023}
}
@article{li2024numinamath,
  title={Numinamath: The largest public dataset in ai4maths with 860k pairs of competition math problems and solutions},
  author={Li, Jia and Beeching, Edward and Tunstall, Lewis and Lipkin, Ben and Soletskyi, Roman and Huang, Shengyi and Rasul, Kashif and Yu, Longhui and Jiang, Albert Q and Shen, Ziju and others},
  journal={Hugging Face repository},
  volume={13},
  pages={9},
  year={2024}
}
@article{yu2023metamath,
  title={Metamath: Bootstrap your own mathematical questions for large language models},
  author={Yu, Longhui and Jiang, Weisen and Shi, Han and Yu, Jincheng and Liu, Zhengying and Zhang, Yu and Kwok, James T and Li, Zhenguo and Weller, Adrian and Liu, Weiyang},
  journal={arXiv preprint arXiv:2309.12284},
  year={2023}
}
@inproceedings{agarwal2024policy,
  title={On-policy distillation of language models: Learning from self-generated mistakes},
  author={Agarwal, Rishabh and Vieillard, Nino and Zhou, Yongchao and Stanczyk, Piotr and Garea, Sabela Ramos and Geist, Matthieu and Bachem, Olivier},
  booktitle={The Twelfth International Conference on Learning Representations},
  year={2024}
}
@article{hinton2015distilling,
  title={Distilling the knowledge in a neural network},
  author={Hinton, Geoffrey and Vinyals, Oriol and Dean, Jeff},
  journal={arXiv preprint arXiv:1503.02531},
  year={2015}
}
@article{wei2022chain,
  title={Chain-of-thought prompting elicits reasoning in large language models},
  author={Wei, Jason and Wang, Xuezhi and Schuurmans, Dale and Bosma, Maarten and Xia, Fei and Chi, Ed and Le, Quoc V and Zhou, Denny and others},
  journal={Advances in neural information processing systems},
  volume={35},
  pages={24824--24837},
  year={2022}
}
@article{zhuang2020consequences,
  title={Consequences of misaligned AI},
  author={Zhuang, Simon and Hadfield-Menell, Dylan},
  journal={Advances in Neural Information Processing Systems},
  volume={33},
  pages={15763--15773},
  year={2020}
}
@article{sharma2023towards,
  title={Towards understanding sycophancy in language models},
  author={Sharma, Mrinank and Tong, Meg and Korbak, Tomasz and Duvenaud, David and Askell, Amanda and Bowman, Samuel R and Cheng, Newton and Durmus, Esin and Hatfield-Dodds, Zac and Johnston, Scott R and others},
  journal={arXiv preprint arXiv:2310.13548},
  year={2023}
}
@article{gerstgrasser2024model,
  title={Is model collapse inevitable? breaking the curse of recursion by accumulating real and synthetic data},
  author={Gerstgrasser, Matthias and Schaeffer, Rylan and Dey, Apratim and Rafailov, Rafael and Sleight, Henry and Hughes, John and Korbak, Tomasz and Agrawal, Rajashree and Pai, Dhruv and Gromov, Andrey and others},
  journal={arXiv preprint arXiv:2404.01413},
  year={2024}
}
@inproceedings{feng2024beyond,
  title={Beyond model collapse: Scaling up with synthesized data requires reinforcement},
  author={Feng, Yunzhen and Dohmatob, Elvis and Yang, Pu and Charton, Francois and Kempe, Julia},
  booktitle={ICML 2024 Workshop on Theoretical Foundations of Foundation Models},
  year={2024}
}
@article{shumailov2024ai,
  title={AI models collapse when trained on recursively generated data},
  author={Shumailov, Ilia and Shumaylov, Zakhar and Zhao, Yiren and Papernot, Nicolas and Anderson, Ross and Gal, Yarin},
  journal={Nature},
  volume={631},
  number={8022},
  pages={755--759},
  year={2024},
  publisher={Nature Publishing Group UK London}
}
@article{li2024superfiltering,
  title={Superfiltering: Weak-to-strong data filtering for fast instruction-tuning},
  author={Li, Ming and Zhang, Yong and He, Shwai and Li, Zhitao and Zhao, Hongyu and Wang, Jianzong and Cheng, Ning and Zhou, Tianyi},
  journal={arXiv preprint arXiv:2402.00530},
  year={2024}
}
